







import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import plotly.graph_objs as go
from datetime import date
from urllib.request import urlopen
import json
from jupyter_dash import JupyterDash
from dash.dependencies import Input, Output
import dash_html_components as html
import dash_core_components as dcc
import warnings
warnings.filterwarnings('ignore')
"""
Function to web scrape most up to date data from usafacts website so that the code
is always up to date with the most recent COVID-19 information
"""
def scrape_live_data():
    """Scrape the latest county-level confirmed-case data from usafacts.org.

    Returns a DataFrame with columns
    [countyFIPS, County Name, State, population, stateFIPS, <one column per date>],
    where each date column holds the cumulative confirmed-case count.
    """
    # Load datasets dynamically from URLs so the code always uses current data.
    confirm_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv', header=0)
    # , encoding = "ISO-8859-1"
    pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)
    # Combine cases and population into one dataset, keyed on county FIPS + state.
    confirmed_cases = pd.merge(left=confirm_df, right=pop_df, how='left', left_on=['countyFIPS','State'], right_on=['countyFIPS','State'])
    # The merge suffixed the duplicate county-name columns ('County Name_x'/'_y');
    # rename the first one back to 'County Name'.
    all_columns = confirmed_cases.columns.values
    all_columns[1] = 'County Name'
    confirmed_cases.columns = all_columns
    # Drop the duplicate county-name column brought in from the population file.
    confirmed_cases.drop(labels=['County Name_y'], axis=1, inplace=True)
    # Move the population column so it sits as the 4th column of the frame.
    population_column = confirmed_cases['population']
    confirmed_cases.drop(labels=['population'], axis=1, inplace=True)
    confirmed_cases.insert(3, 'population', population_column)
    # Drop known problem rows -- presumably bad/duplicate county records;
    # TODO(review): confirm these label positions stay valid as the feed grows.
    indexes = confirmed_cases.loc[[1835, 1862, 1863]].index
    confirmed_cases.drop(indexes, inplace=True)
    return confirmed_cases
def scrape_live_data_deaths():
    """Scrape the latest county-level COVID-19 death data from usafacts.org.

    Returns a DataFrame with columns
    [countyFIPS, County Name, State, population, stateFIPS, <one column per date>],
    where each date column holds the cumulative death count.
    """
    # Load datasets dynamically from URLs so the code always uses current data.
    death_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv', header=0)
    # , encoding = "ISO-8859-1"
    pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)
    # Combine deaths and population into one dataset, keyed on county FIPS + state.
    confirmed_deaths = pd.merge(left=death_df, right=pop_df, how='left', left_on=['countyFIPS','State'], right_on=['countyFIPS','State'])
    # The merge suffixed the duplicate county-name columns ('County Name_x'/'_y');
    # rename the first one back to 'County Name'.
    all_columns = confirmed_deaths.columns.values
    all_columns[1] = 'County Name'
    confirmed_deaths.columns = all_columns
    # Drop the duplicate county-name column brought in from the population file.
    confirmed_deaths.drop(labels=['County Name_y'], axis=1, inplace=True)
    # Move the population column so it sits as the 4th column of the frame.
    population_column = confirmed_deaths['population']
    confirmed_deaths.drop(labels=['population'], axis=1, inplace=True)
    confirmed_deaths.insert(3, 'population', population_column)
    # Drop known problem rows -- presumably bad/duplicate county records;
    # TODO(review): confirm these label positions stay valid as the feed grows.
    indexes = confirmed_deaths.loc[[1835, 1862, 1863]].index
    confirmed_deaths.drop(indexes, inplace=True)
    return confirmed_deaths
def exploratoryDA(confirmed_cases):
    """Print basic exploratory summaries of the scraped data.

    Shows head/tail, column dtypes, non-null counts, and null counts.
    Returns None (output is printed only).
    """
    # checking first few rows to get a sense of data
    print("##############print first 10 rows:#################")
    print(confirmed_cases.head(10))
    # checking last ten rows
    print("##############print last 10 rows:(##############")
    print(confirmed_cases.tail(10))
    # checking the data types of the columns
    print("##############print column data types:##############")
    print(confirmed_cases.dtypes)
    # getting a sense of the per-column counts
    print("#############print column counts:##############")
    print(confirmed_cases.count())
    # checking for nulls
    print("##############Check nulls:##############")
    # NOTE: removed a stray "(edited)" paste artifact that made this line a syntax error
    print(confirmed_cases.isnull().sum())
# arrange columns
def createStatePOP(confirmed_cases):
    """Return total population per state.

    Groups the county-level frame by 'State' and sums the 'population'
    column; the result has exactly the columns [State, population].
    """
    # Pivot table groups by state and sums county populations.
    df_statepop = confirmed_cases.pivot_table(['population'], ['State'], aggfunc='sum').reset_index()
    return df_statepop
def meltData(confirmed_cases):
    """Return total confirmed cases per state as of the most recent date.

    Melts the wide per-date columns into long format, then keeps only the
    latest date (counts are cumulative, so that is the total to date) and
    sums the counties within each state. Also writes the intermediate long
    frame to 'df_confirm_long.csv' as a back-up.

    Returns a frame with columns [State, Confirmed].
    """
    # Date columns start at position 6.
    # NOTE(review): positions 0-4 are meta columns, so this also skips the
    # first date column -- confirm that skip is intentional.
    dates = confirmed_cases.columns[6:]
    # Unpivot: one row per (State, Date) with the cumulative count in 'Confirmed'.
    df_confirm_long = confirmed_cases.melt(id_vars=['State'], value_vars=dates, var_name='Date', value_name='Confirmed')
    # Save a back-up copy of the long-format data.
    df_confirm_long.to_csv('df_confirm_long.csv', index=True)
    # Make sure the confirmed cases are numeric and not string.
    df_confirm_long['Confirmed'] = pd.to_numeric(df_confirm_long['Confirmed'], errors='coerce')
    # Cumulative data: the last date holds the running total for each county.
    last_date = dates[-1]
    df_confirm_long = df_confirm_long[df_confirm_long['Date'] == last_date].groupby(['State'])['Confirmed'].sum().reset_index()
    return df_confirm_long
# need to run after merge, arrange, melt
def createRate(df_statepop, df2_melt):
    """Attach state population and compute the case rate per state.

    Merges the [State, population] frame onto the [State, Confirmed] frame
    and adds 'Rate' = confirmed cases as a percentage of state population.
    """
    # Merge both population and confirmed cases to calculate the rate.
    df2_melt = pd.merge(left=df2_melt, right=df_statepop, how='left', left_on=['State'], right_on=['State'])
    df2_melt['Rate'] = (df2_melt['Confirmed'] / df2_melt['population']) * 100
    return df2_melt
# need to run after merge, arrange, melt, create rate
# Plot geo map for statewide rate = cases/population
def createUSAMap(createRate):
    """Render and save (state_rate.html) a US choropleth of case rates.

    `createRate` is the frame produced by the createRate() function
    (columns State, Confirmed, population, Rate).
    """
    map_data = dict(type='choropleth',
                    colorscale='Portland',
                    locations=createRate['State'],
                    locationmode='USA-states',
                    z=createRate['Rate'],
                    text=createRate['State'],
                    colorbar={'title': 'Cases/Population%'},
                    marker=dict(line=dict(color='rgb(255,255,255)', width=2))
                    )
    map_layout = dict(title='Percentage of COVID-19 Confirmed Cases to Date by Popualtion of the State',
                      geo=dict(scope='usa',
                               showlakes=True,  # lakes
                               lakecolor='rgb(255, 255, 255)'))
    map_cases = go.Figure(map_data)
    map_cases.update_layout(map_layout)
    map_cases.show()
    # Persist the interactive map alongside the notebook output.
    map_cases.write_html("state_rate.html")
# Storing data by month to use for the heat map
# need to run merge before this
def structureDataTimeSeries(confirmed_cases):
    """Return a long-format time series with columns [Date, State, Confirmed].

    'Confirmed' is the statewide cumulative count (counties summed) on each
    date; 'Date' is converted to pandas datetime.
    """
    # Drop the population column; it is not part of the time series.
    df_confirm = confirmed_cases.drop(columns=['population'])
    # Date columns start at position 5.
    # NOTE(review): after the drop there are only 4 meta columns, so this
    # also skips the first date column -- confirm that skip is intentional.
    dates = df_confirm.columns[5:]
    # Unpivot: one row per (State, Date).
    df_confirm_long = df_confirm.melt(id_vars='State', value_vars=dates, var_name='Date', value_name='Confirmed')
    # Sum the counties within each state for each date.
    df_confirm_long = df_confirm_long.groupby(['Date', 'State'])['Confirmed'].sum().reset_index()
    # Convert date strings to datetime for plotting.
    df_confirm_long['Date'] = pd.to_datetime(df_confirm_long['Date'])
    return df_confirm_long
# need to run merge, arrange, structure before this
def createHeatMap(df_confirm_long):
    """Build and save a State x Month heatmap of confirmed cases.

    Expects the long-format frame from structureDataTimeSeries()
    (columns Date, State, Confirmed). Saves 'heatmap.png' and
    'heatmapByState.png', shows the figure, and returns the pyplot module.
    """
    # Derive the month name from each date.
    df_confirm_long['month'] = df_confirm_long['Date'].dt.month_name()
    # We can also get data by day for further analysis if required:
    # df_confirm['day'] = df_confirm['date'].dt.day_name()
    # Counts are cumulative, so the max within a month is the month-end total.
    # sort=False keeps months in first-appearance (i.e. chronological) order.
    df_month = df_confirm_long.groupby(['State', 'month'], sort=False)['Confirmed'].max().reset_index()
    # Saving the dataframe for back-up.
    df_month.to_csv('df_cases_months.csv', index=True)
    # Pivot into heatmap shape: states as rows, months as columns.
    heatmap_data = pd.pivot_table(df_month, values='Confirmed', index='State', columns='month')
    # verifying data
    # print(heatmap_data.iloc[0:3, 0:3])
    # Reorder columns chronologically (pivot_table sorts them alphabetically).
    months = df_month.month.unique().tolist()
    heatmap_data = heatmap_data.reindex(columns=months)
    # Customize and render the heatmap.
    plt.figure(figsize=(200, 200))
    m = sns.heatmap(heatmap_data, cmap='RdBu_r', robust=True)
    m.set_xticklabels(heatmap_data.columns, rotation=45)
    m.set_yticklabels(heatmap_data.index, rotation=45)
    m.set_title('Confirmed Cases by month for all the States in USA')
    # Save BOTH files BEFORE plt.show(): in interactive backends show()
    # clears the current figure, so saving afterwards wrote a blank image.
    plt.savefig('heatmap.png', dpi=150)
    plt.savefig("heatmapByState.png")
    plt.show()
    return plt
def createLinePlot(df_confirm_long):
    """Plot confirmed-case trends (log scale) for five high-population states.

    Expects the long-format frame from structureDataTimeSeries()
    (columns Date, State, Confirmed).
    """
    # Initialize figure and ax.
    fig, ax = plt.subplots()
    states = ['TX', 'CA', 'FL', 'AZ', 'NY']
    compare = df_confirm_long[df_confirm_long['State'].isin(states)]
    # Log scale makes the growth curves comparable across states.
    ax.set(yscale="log")
    ax.set_title("COVID-19 Cases Trends for AZ, CA, FL, NY, TX")
    # style='State' + markers=True gives each state its own marker/line style.
    sns.lineplot(x='Date', y='Confirmed', data=compare, hue='State', ax=ax, style='State', markers=True)
    plt.show()
def plot_geomap_counties(cases, deaths):
    """Show an interactive county-level choropleth of total cases.

    Hover text includes county name and total deaths. `cases` and `deaths`
    are the frames from scrape_live_data() / scrape_live_data_deaths().
    """
    # Cases are cumulative (not per-day), so the latest date column is the
    # total number of cases thus far.
    last_date = cases.columns.tolist()[-1]
    max_cases = cases[['countyFIPS', 'County Name', 'State', f'{last_date}']]
    deaths = deaths[['countyFIPS', f'{last_date}']]
    # Map death totals onto the case rows via county FIPS.
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[f'{last_date}']))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)
    # County boundaries GeoJSON, keyed by 5-character zero-padded FIPS codes.
    with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
        counties = json.load(response)
    # The CSV stores FIPS as ints, so codes that should have a leading zero
    # have only 4 characters; pad them back to match the GeoJSON keys.
    fips_new = {}
    fips = cases['countyFIPS'].astype(str).tolist()
    for f in fips:
        if len(f) == 4:
            fips_new[f] = '0' + f
        else:
            fips_new[f] = f
    # Make fips str, then map the padded values in as a new 'fips' column.
    max_cases['countyFIPS'] = max_cases['countyFIPS'].astype(str)
    max_cases['fips'] = max_cases['countyFIPS'].map(fips_new)
    # Drop statewide-unallocated rows (we only want real counties here).
    # BUGFIX: countyFIPS was just cast to str, so the original comparison
    # against int 0 never matched; compare against the string '0'.
    max_cases = max_cases.loc[max_cases['countyFIPS'] != '0']
    # Plot the heatmap of cases.
    fig = px.choropleth_mapbox(max_cases, geojson=counties, locations='fips', color=f'{last_date}',
                               # color_continuous_scale="tempo",
                               range_color=(1, 5000),
                               mapbox_style="carto-positron",
                               zoom=3, center={"lat": 37.0902, "lon": -95.7129},
                               opacity=0.5,
                               labels={f'{last_date}': '# Cases', 'County Name': 'County: ', 'Deaths': '# Deaths'},
                               hover_data=['County Name', 'Deaths']
                               )
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.show()
def plot_bar(cases, deaths):
    """Show bar charts of total cases and total deaths per state."""
    # Cases are cumulative (not per-day), so the latest date column is the
    # total number of cases thus far.
    last_date = cases.columns.tolist()[-1]
    max_cases = cases[['countyFIPS', 'County Name', 'State', f'{last_date}']]
    deaths = deaths[['countyFIPS', f'{last_date}']]
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[f'{last_date}']))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)
    # Aggregate counties up to state level.
    # NOTE(review): relies on sum() dropping the non-numeric 'County Name'
    # column (older-pandas behavior) so exactly four columns remain below.
    state_counts = max_cases.groupby('State').sum().reset_index()
    # Rename the aggregated columns.
    state_counts.columns = ['State', 'CountyFIPS', 'NumCases', 'NumDeaths']
    # Sort by number of cases.
    # BUGFIX: the original discarded the sort_values() result (sort_values
    # returns a new frame); assign it so the sort actually takes effect.
    state_counts = state_counts.sort_values(by='NumCases')
    # Plot the number of cases in a bar chart.
    #fig = px.bar(state_counts, x='State', y='NumCases')
    #fig.update_layout(title=f'Number of COVID-19 Cases by State as of {last_date}')
    #fig.show()
    fig = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumCases'])
    ])
    # Change the bar mode
    fig.update_layout(barmode='group', title='Number of COVID-19 Cases by State')
    fig.show()
    fig2 = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumDeaths'])
    ])
    # Change the bar mode
    fig2.update_layout(barmode='group', title='Number of COVID-19 Deaths by State')
    fig2.show()
"""
Function to get the differences in cases for the entire dataset
"""
def get_diff_by_state(cases, state_name):
    """Return day-over-day new cases for every county in one state.

    Rows are dates (entire history), columns are county names. The first
    row is NaN because there is no earlier day to difference against.
    """
    # TODO: add try/except and validate that state_name is a real state code.
    state = cases.loc[cases['State'] == str(state_name)]
    # Transpose so dates become rows and counties become columns.
    stateT = state.T
    # Row 1 of the transpose holds the county names; use them as column labels.
    stateT.columns = stateT.iloc[1]
    # Drop the statewide (not-county-attributed) bucket.
    stateT = stateT.drop(columns='Statewide Unallocated')
    # Drop the first five rows (countyFIPS/County Name/State/population/stateFIPS).
    stateT = stateT.iloc[5:]
    # Daily new cases = difference between consecutive cumulative counts.
    diff_counts = {}
    for col in stateT:
        col_cases = stateT[col].diff()
        diff_counts[col] = col_cases
    # Re-assemble the per-county series into one frame of daily differences.
    diff_counties = pd.DataFrame.from_dict(diff_counts)
    return diff_counties
"""
Function to get the differences in cases for the last week
"""
def get_diff_last_week(cases, state_name):
    """Return day-over-day new cases per county for the last 14 days.

    Despite the name, the window is two weeks (matching the dashboard
    titled "Last Two Week" that consumes it). Rows are 'YYYY-MM-DD' date
    strings, columns are county names.
    """
    # TODO: add try/except and validate that state_name is a real state code.
    state = cases.loc[cases['State'] == state_name]
    # Transpose so dates become rows and counties become columns.
    stateT = state.T
    # Row 1 of the transpose holds the county names; use them as column labels.
    stateT.columns = stateT.iloc[1]
    # Drop the statewide (not-county-attributed) bucket.
    stateT = stateT.drop(columns='Statewide Unallocated')
    # Drop the first five rows (countyFIPS/County Name/State/population/stateFIPS).
    stateT = stateT.iloc[5:]
    # Daily new cases = difference between consecutive cumulative counts.
    diff_counts = {}
    for col in stateT:
        col_cases = stateT[col].diff()
        diff_counts[col] = col_cases
    # Re-assemble the per-county series into one frame of daily differences.
    diff_counties = pd.DataFrame.from_dict(diff_counts)
    # Keep only rows from the last 14 days.
    today = pd.to_datetime(date.today())
    week = today - pd.Timedelta(days=14)
    # Promote the date index to a real column so it can be parsed/filtered.
    diff_counties = diff_counties.reset_index().rename(columns={'index': 'Date'})
    diff_counties['Date'] = pd.to_datetime(diff_counties['Date'], infer_datetime_format=True)
    diff_counties = diff_counties.loc[diff_counties['Date'] >= week]
    # Use ISO-formatted date strings as the index and drop the helper column.
    diff_counties.index = pd.to_datetime(diff_counties['Date'], format='%m/%d/%Y')
    diff_counties.index = diff_counties.index.strftime('%Y-%m-%d')
    diff_counties = diff_counties.drop(columns=['Date'])
    return diff_counties
# --- Build the datasets and render the static visualizations ---
confirmed_cases = scrape_live_data()
confirmed_deaths = scrape_live_data_deaths()

# State-level population totals, used by the geo map.
statePOP = createStatePOP(confirmed_cases)
# Per-state confirmed totals (melted to long format).
meltData_df = meltData(confirmed_cases)
# Cases/population percentage per state. Stored under a distinct name so the
# createRate function itself is not shadowed by its own result.
rate_df = createRate(statePOP, meltData_df)
# Long-format time series for the heat and line plots.
df_confirm_long = structureDataTimeSeries(confirmed_cases)

# Geo map of state rates.
createUSAMap(rate_df)
plot_bar(confirmed_cases, confirmed_deaths)
# Line plot.
createLinePlot(df_confirm_long)
# Heatmap.
createHeatMap(df_confirm_long)

plot_geomap_counties(confirmed_cases, confirmed_deaths)


# --- Dash app 1: daily new cases per county over the full history ---
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H3("New COVID Cases Per Day Demo"),
    dcc.Graph(id='graph'),
    html.Label([
        "State",
        dcc.Dropdown(
            id='state-name', clearable=False,
            value='AL', options=[
                {'label': c, 'value': c}
                for c in confirmed_cases['State'].unique().tolist()
            ])
    ]),
])


# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input("state-name", "value")]
)
def update_figure(state_name):
    """Rebuild the county-by-day new-cases heatmap for the selected state."""
    diff_counties = get_diff_by_state(confirmed_cases, state_name)
    fig2 = go.Figure(data=go.Heatmap(
        z=[diff_counties[i] for i in diff_counties.columns],
        x=diff_counties.index.tolist(),
        y=diff_counties.columns.tolist(),
        colorscale='Viridis'))
    return fig2


# Run app and display result inline in the notebook
app.run_server(mode='inline', port=8051)
# --- Dash app 2: daily new cases per county over the last two weeks ---
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("New COVID Cases Per Day In the Last Two Week Demo"),
    dcc.Graph(id='graph'),
    html.Label([
        "State",
        dcc.Dropdown(
            id='state-name', clearable=False,
            value='AL', options=[
                {'label': c, 'value': c}
                for c in confirmed_cases['State'].unique().tolist()
            ])
    ]),
])


# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input("state-name", "value")]
)
def update_figure(state_name):
    """Rebuild the two-week county-by-day new-cases heatmap for the selected state."""
    diff_counties = get_diff_last_week(confirmed_cases, state_name)
    fig2 = go.Figure(data=go.Heatmap(
        z=[diff_counties[i] for i in diff_counties.columns],
        x=diff_counties.index.tolist(),
        y=diff_counties.columns.tolist(),
        colorscale='Viridis'))
    return fig2


# Run app and display result inline in the notebook
app.run_server(mode='inline', port=8060)

import unittest
import datetime
from datetime import timedelta
# inherit from unittest.TestCase
# inherit from unittest.TestCase
class WebScraperTestCase(unittest.TestCase):
    """Unit tests for the scraping and data-shaping helpers.

    setUp scrapes fresh data for each test, so the suite needs network access.
    """

    def setUp(self):
        print("\n*** SETUP ***\n")
        # Get the latest data
        self.confirmed_cases = scrape_live_data()

    def tearDown(self):
        print("\n*** TEARDOWN ***\n")
        # Remove the scraped data
        del self.confirmed_cases

    # TEST - 1 #
    # Is scrape_live_data() successfully getting the oldest data for COVID cases?
    def test_1_is_oldest_data_scraped(self):
        print("Verify web scraped data starts from January 22, 2020")
        # First date column must be Jan 22 2020, reformatted without leading
        # zeros to match the feed's column headers.
        first_day = datetime.datetime(2020, 1, 22).strftime("%x").lstrip("0").replace(" 0", " ")
        self.assertEqual(self.confirmed_cases.columns[5], first_day)

    # TEST - 2 #
    # Is scrape_live_data() successfully getting the latest data for COVID cases?
    def test_2_is_latest_data_scraped(self):
        print("Verify web scraped data is current (till yesterday)")
        # Last date must be for yesterday (1 day ago) - reformatted without leading 0.
        yesterday = (date.today() - timedelta(days=1)).strftime("%x").lstrip("0").replace(" 0", " ")
        self.assertEqual(self.confirmed_cases.columns[-1], yesterday)

    # TEST - 3 #
    # Is scrape_live_data() successfully getting the required columns for analysis?
    def test_3_is_required_data_scraped(self):
        print("Verify web scraped data contains required columns")
        required_columns = ["countyFIPS", "County Name", "State", "population", "stateFIPS"]
        self.assertEqual(self.confirmed_cases.columns[0:5].tolist(), required_columns)

    # TEST - 4 #
    # Is createStatePOP() returning data in the required format?
    def test_4_is_population_data_formatted(self):
        print("Verify state-wise population data is in required format")
        # BUGFIX: the function is named createStatePOP; the original called
        # the nonexistent createStatPOP and raised NameError.
        state_population = createStatePOP(self.confirmed_cases)
        headers = ["State", "population"]
        self.assertEqual(state_population.columns.tolist(), headers)

    # TEST - 5 #
    # Is createStatePOP() returning data for all 50 states (+ DC)?
    def test_5_is_population_data_for_all_states(self):
        print("Verify state-wise population data exists for all 50 states")
        state_population = createStatePOP(self.confirmed_cases)
        self.assertEqual(len(state_population.index), 51)

    # TEST - 6 #
    # Is meltData() returning data in the required format?
    def test_6_is_pivot_data_formatted(self):
        print("Verify date-wise data for each state is in required format")
        melted_data = meltData(self.confirmed_cases)
        headers = ["State", "Confirmed"]
        self.assertEqual(melted_data.columns.tolist(), headers)

    # TEST - 7 #
    # Is structureDataTimeSeries() returning data in the required format?
    def test_7_is_time_series_data_formatted(self):
        print("Verify time-series data is in required format")
        time_series_data = structureDataTimeSeries(self.confirmed_cases)
        # BUGFIX: the function returns [Date, State, Confirmed] (melt then
        # groupby Date/State); the original expected a 'month' column that
        # is only added later by createHeatMap.
        headers = ["Date", "State", "Confirmed"]
        self.assertEqual(time_series_data.columns.tolist(), headers)


if __name__ == '__main__':
    # argv is overridden so unittest ignores notebook/interpreter args;
    # exit=False avoids SystemExit when run inside a notebook session.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

